Quast GC

genomeTable_full <- read.table("/media/harddrive/caseiGroup/04a_quality_control/genomeTable_full.tsv",header=T,sep="\t")
genomesClades <- read.table("/media/harddrive//caseiGroup/Trees/genomeTableWithClades.tsv")
genomeTable_full$clade <- genomesClades[which(genomeTable_full$Assembly%in%genomesClades$V1),"V3"]
genomeTable_full$species <- factor(genomeTable_full$species,levels=c("casei","paracasei","rhamnosus","zeae","sp","isolate"))

aggregate(genomeTable_full[,c('species','GC')], by=list(genomeTable_full$species), FUN=mean, na.rm=TRUE)
## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA

## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA

## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA

## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA

## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA

## Warning in mean.default(X[[i]], ...): argument is not numeric or logical:
## returning NA
##     Group.1 species       GC
## 1     casei      NA 46.54972
## 2 paracasei      NA 46.29868
## 3 rhamnosus      NA 46.66859
## 4      zeae      NA 47.75000
## 5        sp      NA 46.76067
## 6   isolate      NA 48.02000
factor(genomeTable_full$clade,levels=c("cladeA","cladeB","cladeC"))
##   [1] cladeB cladeC cladeA cladeA cladeA cladeC cladeC cladeA cladeC cladeA
##  [11] cladeA cladeC cladeC cladeC cladeC cladeB cladeA cladeA cladeA cladeA
##  [21] cladeA cladeA cladeA cladeA cladeA cladeA cladeA cladeA cladeC cladeC
##  [31] cladeA cladeA cladeA cladeA cladeA cladeA cladeA cladeA cladeA cladeA
##  [41] cladeA cladeA cladeA cladeA cladeA cladeA cladeA cladeA cladeA cladeA
##  [51] cladeA cladeA cladeA cladeC cladeC cladeC cladeA cladeC cladeA cladeC
##  [61] cladeA cladeA cladeA cladeB cladeC cladeC cladeC cladeC cladeC cladeC
##  [71] cladeC cladeC cladeC cladeC cladeC cladeB cladeC cladeA cladeB cladeC
##  [81] cladeA cladeA cladeA cladeC cladeB cladeC cladeC cladeC cladeA cladeA
##  [91] cladeC cladeC cladeC cladeC cladeC cladeA cladeB cladeC cladeC cladeC
## [101] cladeC cladeC cladeC cladeC cladeC cladeC cladeC cladeC cladeA cladeA
## [111] cladeA cladeA cladeA cladeB cladeC cladeA cladeA cladeA cladeA cladeC
## [121] cladeA cladeC cladeC cladeC cladeC cladeC cladeC cladeC cladeC cladeC
## [131] cladeC cladeC cladeC cladeC cladeC cladeC cladeC cladeC cladeC cladeC
## [141] cladeC cladeC cladeC cladeC cladeC cladeC cladeC cladeC cladeC cladeC
## [151] cladeC cladeC cladeC cladeC cladeC cladeC cladeC cladeC cladeC cladeC
## [161] cladeB cladeC cladeC cladeC cladeC cladeC cladeA cladeC cladeC cladeC
## [171] cladeC cladeA cladeA cladeA cladeB cladeA cladeC cladeA cladeA cladeC
## [181] cladeC cladeC cladeC cladeC
## Levels: cladeA cladeB cladeC
library(ggplot2)
plot <- ggplot(genomeTable_full,aes(x=species,y=GC))
plot + geom_jitter(aes(colour=clade),width=0.3, height=0, alpha=0.8) + scale_color_manual(values=c('#e41a1c','#377eb8','#4daf4a')) + theme_bw() +
  scale_x_discrete(labels=c("casei" = "Lactobacillus casei", "isolate" = "AMBR2", "paracasei" = "Lactobacillus paracasei", "rhamnosus" = "Lactobacillus rhamnosus", "zeae" = "Lactobacillus zeae","sp" = "Lactobacillus sp.")) +
  theme(axis.text.x = element_text(face=c("bold.italic","bold.italic","bold.italic","bold.italic","bold.italic","bold"), angle=45, hjust=1,vjust=1)) +
  xlab("")

Read in data

library(stringr)
setwd("/media/harddrive/caseiGroup/04b_gc_content/outfiles")

file_list <- list.files('/media/harddrive/caseiGroup/04b_gc_content/outfiles')
 
for (file in file_list){
       
  # if the merged dataset doesn't exist, create it
  if (!exists("dataset")){
    dataset <- read.table(file, header=TRUE, sep="\t",quote="")
    dataset$Genome <- unlist(str_split(file,pattern='\\.'))[1] 
    } else {
    temp_dataset <-read.table(file, header=TRUE, sep="\t",quote="")
    temp_dataset$Genome <- unlist(str_split(file,pattern='\\.'))[1]
    dataset<-rbind(dataset, temp_dataset)
    rm(temp_dataset)
    
  }
}

saveRDS(dataset,file="/media/harddrive/caseiGroup/04b_gc_content/GC_content_per_gene_caseigroup.Robject")

Calculate meanGC

plot mean GC

setwd('/media/harddrive/caseiGroup/04b_gc_content/')
dataset <- readRDS(file='GC_content_per_gene_caseigroup.Robject')
meanGC <- aggregate(X.GC~Genome,data=dataset, mean)

# Add species
names(genomesClades) <- c("Genome","Source","clade")

meanGC <- merge(meanGC,genomesClades,all.x=T,all.y=T)

plot <- ggplot(meanGC,aes(x=clade,y=X.GC))
plot + geom_jitter(aes(colour=Source))  +theme_bw()

Weighted meanGC

weightedmeanGC <- sapply(split(dataset, dataset$Genome), function(d) weighted.mean(d$X.GC, w = d$Length))
weightedmeanGCdf <- as.data.frame(weightedmeanGC)
weightedmeanGCdf$Genome <- row.names(weightedmeanGCdf)
names(weightedmeanGCdf)[1] <- 'X.GC'

weightedmeanGCdf <- merge(weightedmeanGCdf,genomesClades,all.x=T,all.y = T)

plot <- ggplot(weightedmeanGCdf)
plot + geom_jitter(aes(x=clade,y=X.GC,colour=Source)) +
  theme_bw() +
  theme(axis.text.x = element_text(angle=45,vjust=1,hjust=1)) + 
  ylab("GC percentage coding region") +
  xlab(" ")

GC content next level

First let’s read in the phylogenetic tree to get the in order to be able to reorder the genomes according to their phylogenetic structure

library(ggtree)
## ggtree v1.6.11  For help: https://guangchuangyu.github.io/ggtree
## 
## If you use ggtree in published research, please cite:
## Guangchuang Yu, David Smith, Huachen Zhu, Yi Guan, Tommy Tsan-Yuk Lam. ggtree: an R package for visualization and annotation of phylogenetic trees with their covariates and other associated data. Methods in Ecology and Evolution 2017, 8(1):28-36, doi:10.1111/2041-210X.12628
library(ape)
## 
## Attaching package: 'ape'
## The following object is masked from 'package:ggtree':
## 
##     rotate
tree <- read.tree('/media/harddrive/caseiGroup/Trees/RAxML_bipartitions.caseigroup')

# Root tree
tree_r <- ape::root(tree, "outgroup")
# Remove outgroup
tree_r_wooutroup <- drop.tip(tree_r, "outgroup")

# Test ggtree
p <- ggtree(tree_r_wooutroup) +  geom_tree()
p

# Get the label order
d <- fortify(tree_r_wooutroup)
dd <- subset(d, isTip)
tree_order <- dd$label[order(dd$y,decreasing=T)]

Now lets’ go on:

dataset <- merge(dataset,genomesClades,all.x=T)
dataset$Genome <- as.factor(dataset$Genome)

At this point, we save the dataset so that Stijn can also use it for visualization and stuff on his local machine:

save(dataset, file = "GC_table_forStijn.Robject")

Plot of all GC content per gene of each genome as points

dataset$Genome_reorderd <- factor(dataset$Genome,levels=tree_order,ordered=T)
plot <- ggplot(dataset)
plot + geom_jitter(aes(x=X.GC, y=Genome_reorderd,colour=clade),width=0,size=0.1, alpha=0.1) +
  scale_color_manual(values=c("#a65628","#1b9e77", "#984ea3")) + theme_bw() +
  xlab("GC %") + ylab("") +
  theme(axis.text.y=element_text(size=4)) +
  guides(colour = guide_legend(override.aes = list(size=5,alpha=1)))

#dev.copy2pdf(file="/media/harddrive//caseiGroup/GC_content/GC_contentpergene.pdf",out.type="cairo", width=10, height=7.5)

Plot of all GC content per gene of each genome as densities

plot <- ggplot(dataset)
plot + geom_density(aes(X.GC,colour=clade, group=Genome), size=0.2,alpha=0.5) +
  scale_color_manual(values=c("#a65628","#1b9e77", "#984ea3")) + 
  theme_bw()

plot <- ggplot(dataset, aes(X.GC))
plot + geom_density(aes(colour=clade)) +
 geom_jitter(aes(y=-1*ifelse(clade=="cladeA",0.1,ifelse(clade=="cladeB",0.2,0.3)),x=X.GC,colour=clade),height=0.05, width=0,size=0.01)+
  scale_color_manual(values=c("#a65628","#1b9e77", "#984ea3")) +
  theme_bw() +
  scale_y_continuous(limits=c(-0.35,0.2),breaks=c(0,0.2))

Unimodality test

#library(diptest)
#dip(dataset[which(dataset$clade=="cladeC"),"X.GC"])

GC vs length

plot <- ggplot(dataset)
plot + geom_point(aes(x=X.GC,y=Length, colour=clade), size=0.2,alpha=0.5) +
  scale_color_manual(values=c("#a65628","#1b9e77", "#984ea3"))  + 
  facet_wrap(~clade) +
  theme_bw()

Cool! There are two very big genes in cladeB.

Clade B only

I do think I see some abnormality in the first two graphs above in clade B. So let’s only plot clade B.

dataset_cladeB <- dataset[which(dataset$clade=="cladeB"),]
plot <- ggplot(dataset_cladeB)
plot + geom_jitter(aes(x=X.GC, y=Genome_reorderd,colour=clade),width=0,size=0.1, alpha=0.1) +
  scale_color_manual(values=c("#1b9e77")) + theme_bw() +
  xlab("GC %") + ylab("") +
  theme(axis.text.y=element_text(size=15)) +
  guides(colour = guide_legend(override.aes = list(size=5,alpha=1)))

plot <- ggplot(dataset_cladeB)
plot + geom_density(aes(X.GC,colour=clade, group=Genome), size=0.2,alpha=0.5) +
  scale_color_manual(values=c("#1b9e77")) + 
  theme_bw()

Let’s look at genes with GC content between 52 -55

plot <- ggplot(dataset_cladeB[which(dataset_cladeB$X.GC>52 & dataset_cladeB$X.GC<55),])
plot + geom_density(aes(X.GC,colour=clade, group=Genome), size=0.2,alpha=0.5) +
  scale_color_manual(values=c("#1b9e77"))  +
  theme_bw()